.libPaths(c("/data/home/lyang/R/x86_64-redhat-linux-gnu-library/4.1",
"/data/home/bioinfo/R/R4.1.0/library_B3.13",
"/usr/lib64/R/library","/usr/share/R/library") )
dyn.load("/data/home/bioinfo/programs/hdf5-1.12.0/hdf5/lib/libhdf5_hl.so.200")
knitr::opts_knit$set(root.dir = "/data/home/lyang/Visium_spotlight")
setwd("/data/home/lyang/Visium_spotlight")
init_require_packages <- function(){
library(ggplot2)
library(SPOTlight)
library(SingleCellExperiment)
library(SpatialExperiment)
library(scater)
library(scran)
library(Seurat)
library(SeuratData)
library(SeuratDisk)
}
init_require_packages()
Load the spatial data:
rds_file='Visium_spatial.rds'
spatial <- readRDS(rds_file)
Load the single cell data.
# dataset_name = "cell2location34" Tabula in_house
dataset_name = "cell2location34"
sc_dataset_file = paste(dataset_name,"sc_dataset.rds",sep ="_")
# library(SeuratDisk)
if(file.exists(sc_dataset_file)){
rds_file= sc_dataset_file
sc_dataset <- readRDS(rds_file)
}else{
import_sc_dataset <- function()
{
# Convert("D:/minor_intership_data/TS_Lymph_Node.h5ad", dest = "h5seurat", overwrite = TRUE)
sc_dataset <- LoadH5Seurat("D:/minor_intership_data/TS_Lymph_Node.h5seurat",assays = "RNA")
sc_dataset
head(sc_dataset@meta.data, 5)
saveRDS(sc_dataset,'sc_dataset.rds')
return(sc_dataset)
}
sc_dataset = import_sc_dataset()
}
# colnames(sc_dataset@meta.data)
# table(sc_dataset@meta.data$CellType2)
# table(sc_dataset@meta.data$Subset)
Quick data exploration:
# annotation = "cell2location34" Tabula
# annotation = "cell2location34"
dataset_name = "cell2location34"
if(dataset_name == "cell2location34"){
cell_type_table = as.data.frame(table(sc_dataset$Subset))
}else if(dataset_name == "Tabula"){
cell_type_table = as.data.frame(table(sc_dataset$cell_ontology_class))
}else if(dataset_name == "cell2location44"){
cell_type_table = as.data.frame(table(sc_dataset$CellType))
}
colnames(cell_type_table) = c("cell_type","frequency")
library(ggplot2)
# Basic barplot
p <- ggplot(data=cell_type_table, aes(x=cell_type, y=frequency,fill=cell_type)) +
geom_bar(stat="identity")+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
geom_text(aes(label=frequency), position=position_dodge(width=0.9), vjust=-0.25)
# ggsave(p,filename = "cell_type_histogram.pdf",width = 15,
# height = 20)
print(p)
# Keep cells with clear cell type annotations
# sce <- subset(sce, , !cell_ontology_class %in% c("nan", "CD45"))
# sc_dataset <- subset(sc_dataset, subset = cell_ontology_class != "nan")
sc_reference_markers_file = paste(dataset_name,"sc_reference_markers.rds",sep ="_")
sc_hvg_file = paste(dataset_name,"sc_hvg.rds",sep ="_")
sc_dataset_VlnPlot_file = paste(dataset_name,"sc_dataset_VlnPlot.rds",sep ="_")
if(file.exists(sc_hvg_file)){
rds_file= sc_reference_markers_file
sc_reference_markers <- readRDS(rds_file)
rds_file= sc_hvg_file
sc_hvg <- readRDS(rds_file)
}else
{
qc_filter_sc <- function(sc_dataset,dataset_name){
if(dataset_name == "Tabula"){
sc_dataset[['nCount_RNA']] = sc_dataset$n_counts_UMIs
}else if(dataset_name == "cell2location"){
sc_dataset[['nCount_RNA']] = sc_dataset$n_counts
}
mt.genes <- grep(pattern = '^MT-', x = rownames(x = sc_dataset@assays$RNA),value = TRUE)
percent.mt <- Matrix::colSums(sc_dataset@assays$RNA[mt.genes, ]) / Matrix::colSums(sc_dataset@assays$RNA)
sc_dataset <- AddMetaData(object = sc_dataset,metadata = percent.mt,
col.name = 'percent.mt')
# filter cells based on QC metrics
sc_dataset_filtered <- subset(sc_dataset, subset = n_genes > 200 & n_genes < 5000 & percent.mt < 0.05 )
return(list("sc"= sc_dataset,"sc_filtered"= sc_dataset_filtered))
}
if(dataset_name=="cell2location" | dataset_name == "cell2location34"){
a_list = qc_filter_sc(sc_dataset,"cell2location")
}else if(dataset_name=="Tabula"){
a_list = qc_filter_sc(sc_dataset,"Tabula")
}
sc_dataset = a_list$sc
# Visualize QC metrics as a violin plot
saveRDS(sc_dataset,sc_dataset_VlnPlot_file)
sc_dataset_filtered = a_list$sc_filtered
sc_dataset_filtered <- NormalizeData(sc_dataset_filtered, normalization.method = "LogNormalize", scale.factor = 10000)
head(rownames(sc_dataset_filtered))
# Get vector indicating which genes are neither ribosomal or mitochondrial
genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(sc_dataset_filtered))
if(dataset_name=="Tabula"){
Idents(sc_dataset_filtered) <- 'cell_ontology_class'
}else if(dataset_name=="cell2location"){
Idents(sc_dataset_filtered) <- 'CellType'
}else if(dataset_name=="cell2location34"){
Idents(sc_dataset_filtered) <- 'Subset'
}
head(Idents(sc_dataset_filtered), 5)
sc_dataset_filtered <- FindVariableFeatures(sc_dataset_filtered, selection.method = "vst", nfeatures = 3000)
sc_hvg =VariableFeatures(sc_dataset_filtered)
saveRDS(sc_hvg,sc_hvg_file)
sc_reference_markers <- FindAllMarkers(sc_dataset_filtered, min.pct = 0.25,
logfc.threshold = 0.25,max.cells.per.ident=200)
library(dplyr)
sc_reference_markers <- sc_reference_markers %>%
group_by(cluster) %>%
slice_max(n = 10, order_by = avg_log2FC)
saveRDS(sc_reference_markers,sc_reference_markers_file)
}
rds_file= sc_dataset_VlnPlot_file
sc_dataset_VlnPlot <- readRDS(rds_file)
VlnPlot(sc_dataset_VlnPlot, features = c("n_genes", "nCount_RNA"), ncol = 2)
sc_dataset.downsampled_file = paste(dataset_name,"sc_dataset.downsampled.rds",sep ="_")
if (file.exists(sc_dataset.downsampled_file)){
rds_file = sc_dataset.downsampled_file
sc_dataset_filtered <- readRDS(rds_file)
}else
{
# downsample to at most 100 per identity
subsample_cells <- function(sc_dataset,dataset_name)
{
if(dataset_name == "Tabula"){
Idents(sc_dataset) <- 'cell_ontology_class'
}else if(dataset_name == "cell2location"){
Idents(sc_dataset) <- 'CellType'
}else if(dataset_name == "cell2location34"){
Idents(sc_dataset) <- 'Subset'
}
table(Idents(sc_dataset))
unique(Idents(sc_dataset))
cell.list <- WhichCells(sc_dataset, idents = unique(Idents(sc_dataset)),
downsample = 1000)
gc()
sc_dataset <- sc_dataset[, cell.list]
# table(sc_dataset$cell_ontology_class)
saveRDS(sc_dataset,sc_dataset.downsampled_file)
return(sc_dataset)
}
# subsample fixed numbers of cells from differently sized cell types in a seurat object.
sc_dataset_filtered = subsample_cells(sc_dataset_filtered,dataset_name)
}
You are now set to run SPOTlight to deconvolute the spots!
getwd()
## [1] "/data/home/lyang/Visium_spotlight"
res_file = paste(dataset_name,"res.rds",sep ="_")
if(file.exists(res_file)){
rds_file = res_file
res <- readRDS(rds_file)
}else{
if(dataset_name=="cell2location34"){
res <- SPOTlight(
x = sc_dataset_filtered,
y = spatial@assays$Spatial@counts,
groups = sc_dataset_filtered$Subset,
mgs = sc_reference_markers,
hvg = sc_hvg,
weight_id = "avg_log2FC",
group_id = "cluster",
gene_id = "gene")
saveRDS(res,res_file)
}else if(dataset_name=="Tabula")
{
res <- SPOTlight(
x = sc_dataset_filtered,
y = spatial@assays$Spatial@counts,
groups = sc_dataset_filtered$cell_ontology_class,
mgs = sc_reference_markers,
hvg = sc_hvg,
weight_id = "avg_log2FC",
group_id = "cluster",
gene_id = "gene")
saveRDS(res,res_file)
}
}
# Time for training: 708.35min
# Deconvoluting mixture data
Extract data from SPOTlight:
# Extract deconvolution matrix
head(mat <- res$mat)[, seq_len(6)]
## T_CD4+_TfH DC_CCR7+ T_Treg T_CD4+_naive T_TfR
## AAACAAGTATCTCCCA-1 0.1712394 0.09197712 0 0.02533235 0.03647274
## AAACAATCTACTAGCA-1 0.0000000 0.00000000 0 0.00000000 0.00000000
## AAACACCAATAACTGC-1 0.1696150 0.03258153 0 0.10231059 0.02398280
## AAACAGAGCGACTCCT-1 0.0000000 0.00000000 0 0.00000000 0.00000000
## AAACAGCTTTCAGAAG-1 0.1644396 0.00000000 0 0.07022688 0.07221897
## AAACAGGGTCTATATT-1 0.0000000 0.00000000 0 0.00000000 0.00000000
## T_CD8+_cytotoxic
## AAACAAGTATCTCCCA-1 0.03789942
## AAACAATCTACTAGCA-1 0.00000000
## AAACACCAATAACTGC-1 0.04188913
## AAACAGAGCGACTCCT-1 0.00000000
## AAACAGCTTTCAGAAG-1 0.05083818
## AAACAGGGTCTATATT-1 0.00000000
# Extract NMF model fit
mod <- res$NMF
In the next section we show how to visualize the data and interpret SPOTlight’s results.
We first take a look at the Topic profiles. By setting facet = FALSE we want to evaluate how specific each topic signature is for each cell identity. Ideally each cell identity will have a unique topic profile associated to it as seen below.
Next we also want to ensure that all the cells from the same cell identity share a similar topic profile since this will mean that SPOTlight has learned a consistent signature for all the cells from the same cell identity.
# ggplot(df, aes_string(x, "topic",
# col = "weight")) +
# geom_point() +
# facet_wrap(~group, ncol = 5, scales = "free_x")+
# scale_y_continuous(breaks = seq(0,35,1))
if(dataset_name == "Tabula"){
cell_type = sc_dataset_filtered$cell_ontology_class
}else if(dataset_name == "cell2location"){
cell_type = sc_dataset_filtered$CellType
}else if(dataset_name == "cell2location34"){
cell_type = sc_dataset_filtered$Subset
}
# cell_type variable is a vector of group labels for each cell in sc reference dataset
plotTopicProfiles(
x = mod,
y = cell_type,
facet = TRUE,
min_prop = 0.2,
ncol = 5)
plotTopicProfiles2 <- function(x, y,
facet = FALSE,
min_prop = 0.1,
ncol = NULL) {
y <- as.character(y)
# get group proportions
mat <- prop.table(t(coef(x)), 1)
if (facet) {
} else {
# get topic medians
df <- aggregate(mat, list(y), mean)
group = df[,1]
df <- df[,-1]
# stretch for plotting
df <- data.frame(
weight = unlist(df),
group = rep(group, ncol(df) ),
topic = rep(seq_len(nrow(df)), each = nrow(df))
)
# set aesthetics
x <- "group"
f <- NULL
}
# fix topic order
df$topic <- factor(df$topic, seq_along(unique(y)))
# render plot
ggplot(data = df, mapping = aes(x = group, y = topic, col = weight,
size = weight)) + geom_point()+
# guides(col = guide_legend(override.aes = list(size = 2))) +
scale_color_continuous(low = "red", high = "blue") +
scale_size_continuous(range = c(0, 5)) +
scale_fill_continuous(limits=c(0, 1.25), breaks=seq(0,1,by=0.25))+
# xlab(if (facet) x) +
theme_bw() +
theme(
panel.grid = element_blank(),
# legend.key.size = unit(0.5, "lines"),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1))
}
plotTopicProfiles2(
x = mod,
y = cell_type,
facet = FALSE,
min_prop = 0.01,
ncol = 1)
# +
# theme(aspect.ratio = 1,axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
Lastly we can take a look at which genes the model learned for each topic. Higher values indicate that the gene is more relevant for that topic. In the below table we can see how the top genes for Topic1 are characteristic for B cells (i.e. Cd79a, Cd79b, Ms4a1…).
library(NMF)
sign <- basis(mod)
colnames(sign) <- paste0("Topic", seq_len(ncol(sign)))
head(sign)
# This can be dynamically visualized with DT as shown below
# DT::datatable(sign, fillContainer = TRUE, filter = "top")
See here for additional graphical parameters.
library(ggcorrplot)
plotCorrelationMatrix(mat)
Now that we know which cell types are found within each spot we can make a graph representing spatial interactions where cell types will have stronger edges between them the more often we find them within the same spot.
See here for additional graphical parameters.
# plotInteractions(mat, "heatmap")
# plotInteractions(mat, "network")
DimPlot(sc_dataset_filtered, group.by = "Subset", label = TRUE)
# DimPlot(sc_dataset, group.by = "cell_ontology_class")
# PCAPlot(sc_dataset, group.by = "cell_ontology_class")
# UMAPPlot(sc_dataset, group.by = "cell_ontology_class")
We can also visualize the cell type proportions as sections of a pie chart for each spot. You can modify the colors as you would a standard .
# This can be dynamically visualized with DT as shown below
ct <- colnames(mat)
mat[mat < 0.1] <- 0
library(RColorBrewer)
if(dataset_name == "Tabula"){
n <- 29
}else if(dataset_name == "cell2location"){
n <- 44
}else if(dataset_name == "cell2location34"){
n <- 34
}
colrs <- brewer.pal.info[brewer.pal.info$colorblind == TRUE, ]
col_vec = unlist(mapply(brewer.pal, colrs$maxcolors, rownames(colrs)))
set.seed(123)
col <- sample(col_vec, n)
# Define color palette
# (here we use 'paletteMartin' from the 'colorBlindness' package)
paletteMartin <- col
pal <- colorRampPalette(paletteMartin)(length(ct))
names(pal) <- ct
plotSpatialScatterpie(
x = spatial,
y = mat,
cell_types = colnames(y),
img = FALSE,
scatterpie_alpha = 1,
pie_scale = 0.4) +
scale_fill_manual(
values = pal,
breaks = names(pal))
Lastly we can also take a look at how well the model predicted the proportions for each spot. We do this by looking at the residuals of the sum of squares for each spot which indicates the amount of biological signal not explained by the model.
# # spatial_dir = 'D:/minor_intership_data/Visium_Spatial_Lymph_Node_Data/data'
# # spatial <- read10xVisium(spatial_dir,
# # type = "HDF5", data = "raw",
# # images = "lowres",load = FALSE)
# # saveRDS(spatial,"spatial_residual.rds")
#
# rds_file = "spatial_residual.rds"
# spatial_residual <- readRDS(rds_file)
#
# spatial_residual$res_ss <- res[[2]][colnames(spatial_residual)]
# xy <- spatialCoords(spatial_residual)
# spatial_residual$x <- xy[, 1]
# spatial_residual$y <- xy[, 2]
# detach("package:SeuratObject", unload=TRUE)
#
# ggcells(spatial_residual, aes(x, y, color = res_ss)) +
# geom_point() +
# scale_color_viridis_c() +
# coord_fixed() +
# theme_bw()
rds_file='predictions.assay.rds'
predictions.assay <- readRDS(rds_file)
predictions.assay@data <- t(mat)
meta.features <- as.data.frame(colnames(mat) )
rownames(meta.features) = meta.features[,1]
meta.features$`colnames(mat)` =NULL
predictions.assay@meta.features = meta.features
spatial[["predictions"]] <- predictions.assay
DefaultAssay(spatial) <- "predictions"
table(sc_dataset@meta.data$Subset)
##
## B_Cycling B_GC_DZ B_GC_LZ B_GC_prePB
## 4765 2500 3298 74
## B_IFN B_activated B_mem B_naive
## 199 3575 13476 8924
## B_plasma B_preGC DC_CCR7+ DC_cDC1
## 1094 404 42 101
## DC_cDC2 DC_pDC Endo FDC
## 173 226 622 76
## ILC Macrophages_M1 Macrophages_M2 Mast
## 617 121 110 18
## Monocytes NK NKT T_CD4+
## 306 1372 896 3059
## T_CD4+_TfH T_CD4+_TfH_GC T_CD4+_naive T_CD8+_CD161+
## 4690 3653 6012 2294
## T_CD8+_cytotoxic T_CD8+_naive T_TIM3+ T_TfR
## 3890 2253 357 1065
## T_Treg VSMC
## 2958 40
if(dataset_name == "Tabula"){
# p = SpatialFeaturePlot(spatial, features = c("stromal cell", "t cell"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)
# print(p)
# p = SpatialFeaturePlot(spatial, features = c("naive b cell", "b cell"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)
# print(p)
}else if(dataset_name == "cell2location"){
# SpatialFeaturePlot(spatial, features = c("T_Treg", "DC_cDC2"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)
# SpatialFeaturePlot(spatial, features = c("DC_cDC1", "DC_cDC2"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)
}else if(dataset_name == "cell2location34"){
detach("package:SpatialExperiment", unload=TRUE)
p = SpatialFeaturePlot(spatial, features = c("T_CD4+", "T_CD8+_CD161+","B_GC_LZ"), pt.size.factor = 1.6, ncol = 3, crop = TRUE)
print(p)
p = SpatialFeaturePlot(spatial, features = c("B_plasma", "DC_cDC1"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)
print(p)
SpatialFeaturePlot(spatial, features = c("VSMC", "B_Cycling"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)
}
# table(sc_dataset@meta.data$Subset)
# sessionInfo()